In [1]:
import os
from urllib import urlretrieve
import graphlab
In [2]:
URL = 'https://d396qusza40orc.cloudfront.net/phoenixassets/people_wiki.csv'
In [3]:
def get_data(filename='people_wiki.csv', url=URL, force_download=False):
    """Download and cache the people_wiki dataset.

    Parameters
    ----------
    filename : string (optional)
        Location to save the data (and to load it from on later calls).
    url : string (optional)
        Source URL to download from.
    force_download : bool (optional)
        If True, force redownload of the data even if a cached copy exists.

    Returns
    -------
    data : graphlab SFrame. Similar to a pandas DataFrame,
        but with capacity for faster analysis of larger data sets.
    """
    # Only hit the network when there is no cached copy (or a refresh is forced).
    if force_download or not os.path.exists(filename):
        urlretrieve(url, filename)
    # BUG FIX: load from `filename` rather than the hard-coded
    # 'people_wiki.csv', so a caller-supplied filename actually points at
    # the file that was just saved.
    sf = graphlab.SFrame(filename)
    return sf
In [4]:
# Fetch (or load the cached) dataset and preview the first rows.
people = get_data()
people.head()
In [40]:
# Pull out the Wikipedia entry for Barack Obama and show its raw text.
is_obama = people['name'] == 'Barack Obama'
obama = people[is_obama]
obama['text']
Out[40]:
In [7]:
# Same lookup for George Clooney, for comparison.
is_clooney = people['name'] == 'George Clooney'
clooney = people[is_clooney]
clooney['text']
Out[7]:
In [9]:
# Tokenize Obama's article into a bag-of-words dict ({word: count})
# and attach it as a new column.
obama_counts = graphlab.text_analytics.count_words(obama['text'])
obama['word_count'] = obama_counts
obama['word_count']
Out[9]:
In [41]:
# Flatten the word_count dict column into (word, count) rows so the
# article's vocabulary can be sorted; display the most frequent words.
obama_wordcount_table = obama[['word_count']].stack(
    'word_count', new_column_name=['word', 'count'])
obama_wordcount_table.sort('count', ascending=False)
Out[41]:
The most common words are "the", "in", etc., which carry little meaning. We'll use TF-IDF (term frequency–inverse document frequency) to fix this problem.
Term frequency (TF): the number of times a word appears in the article.
Inverse document frequency (IDF): the log of the total number of articles divided by the number of articles containing the word:
IDF = log(Total # articles / # articles containing word)
A word's TF-IDF score is TF × IDF, so words that appear in most articles are down-weighted.
In [17]:
# Build a bag-of-words column for every article in the corpus.
corpus_counts = graphlab.text_analytics.count_words(people['text'])
people['word_count'] = corpus_counts
In [22]:
# Use graphlab's built-in TF-IDF (the calculation described above)
# over the word-count dicts, and store the scores per article.
corpus_tfidf = graphlab.text_analytics.tf_idf(people['word_count'])
people['tfidf'] = corpus_tfidf
In [23]:
people.head()
Out[23]:
In [42]:
# Re-select Obama's row (it now carries the tfidf column) and list his
# words from highest to lowest TF-IDF score.
obama = people[people['name']=='Barack Obama']
obama_tfidf_table = obama[['tfidf']].stack(
    'tfidf', new_column_name=['word', 'tfidf'])
obama_tfidf_table.sort('tfidf', ascending=False)
Out[42]:
These top-ranked words are much more relevant to the subject than the generic stopwords ("the", "in", ...) that dominated the raw word counts.
In [44]:
# Look at three more people to compare against Obama.
# NOTE(review): `Tswift` breaks the lowercase naming convention used for
# `clinton`/`beckham`; kept as-is because later cells reference this name.
clinton = people[people['name'] == 'Bill Clinton']
beckham = people[people['name'] == 'David Beckham']
Tswift = people[people['name'] == 'Taylor Swift']
In [29]:
# Cosine distance between two articles' TF-IDF vectors:
# 0 means identical direction, 1 is the maximum distance.
obama_vec = obama['tfidf'][0]
clinton_vec = clinton['tfidf'][0]
graphlab.distances.cosine(obama_vec, clinton_vec)
Out[29]:
In [46]:
graphlab.distances.cosine(obama['tfidf'][0], Tswift['tfidf'][0])
Out[46]:
In [34]:
knn_model = graphlab.nearest_neighbors.create(people, features=['tfidf'], label='name')
In [47]:
# Closest Wikipedia articles to Obama under the model's distance.
knn_model.query(obama)
Out[47]:
In [48]:
knn_model.query(Tswift)
Out[48]: